United States Car Accident Project

Author

Jason Rappazzo

Published

April 19, 2023

1 Indroduction

1.1 Motivation for the Project - James

1.2 Research Questions -James

1.3 Overview of Modeling Techniques - Me

Binary Response Model

Linear Regression

Lasso Regression

Ridge Regression

Ordinal Logit Regression

Decision Tree

Random Forest

Artificial Neural Network

2 Raw Data

3 Preparing Data For Machine Learning

accident_12var <- accident_raw %>% 
  select(Severity,State, `Temperature(F)`, `Humidity(%)`,
         `Visibility(mi)`, `Wind_Speed(mph)`, Weather_Condition,
         `Precipitation(in)`, Crossing, Junction, Traffic_Signal,
         Sunrise_Sunset)

colnames(accident_12var) <- gsub("\\)|\\%|\\(", ".", colnames(accident_12var))
library(caret)
library(recipes)
library(dplyr)

# Split the data into training and testing sets
set.seed(2)
train_indices <- createDataPartition(accident_12var$Severity, p = 0.8, list = FALSE)
train_set <- accident_12var[train_indices, ]
test_set <- accident_12var[-train_indices, ]


# TRAIN SET

# Make a copy of the train set
copied_traindata <- data.frame(train_set)

# Add an id column to copied_traindata
copied_traindata <- copied_traindata %>% mutate(id = row_number())

# Separate Label from Feature
accident <- select(copied_traindata, -Severity) # drop Severity column
label <- copied_traindata$Severity # select Severity column

# Separate Numerical from Categorical
accident_num <- accident %>% 
  select(id, Temperature.F., Humidity..., Visibility.mi., Wind_Speed.mph., Precipitation.in.)

accident_cat <- accident %>% 
  select(id, State, Weather_Condition, Crossing, Junction, Traffic_Signal, Sunrise_Sunset)

# Define numeric and categorical attributes
num_attribs <- names(accident_num)[2:6]
cat_attribs <- names(accident_cat)[2:7]

# Define preprocessing pipelines
num_pipeline <- recipe(~., data = accident_num) %>%
  step_impute_median(all_numeric(), -has_role("id")) %>%
  step_center(all_numeric(), -has_role("id")) %>%
  step_scale(all_numeric(), -has_role("id"))

cat_pipeline <- recipe(~., data = accident_cat) %>%
  step_dummy(all_nominal())

# Merge the preprocessed numerical and categorical features into a single dataset

accident <- accident %>% rename(Index = id) 

df1 <- mutate(num_pipeline %>% prep() %>% bake(new_data = NULL), join_key = "Index")
df2 <- mutate(cat_pipeline %>% prep() %>% bake(new_data = NULL), join_key = "Index")

accident_prepared <- accident %>% 
  select(-one_of(c(cat_attribs, num_attribs)))

accident_prepared <- cbind(accident_prepared, df1,df2)



accident_prepared <- accident_prepared %>% 
  distinct()

accident_prepared <- select(accident_prepared, -c("Index", "id", "join_key", "id.1", "join_key.1"))






#TEST SET
# Make a copy of the test set
copied_testdata <- data.frame(test_set)

# Add an id column to copied_testdata
copied_testdata <- copied_testdata %>% mutate(id = row_number())

# Separate Label from Feature
accident_test <- select(copied_testdata, -Severity) # drop Severity column
label_test <- copied_testdata$Severity # select Severity column

# Separate Numerical from Categorical
accident_num_test <- copied_testdata %>% 
  select(Temperature.F., Humidity..., Visibility.mi., Wind_Speed.mph., Precipitation.in.)

accident_cat_test <- copied_testdata %>% 
  select(State, Weather_Condition, Crossing, Junction, Traffic_Signal, Sunrise_Sunset)

# Define numeric and categorical attributes
num_attribs <- names(accident_num_test)[1:6]
cat_attribs <- names(accident_cat_test)[1:7]

# Define preprocessing pipelines
num_pipeline <- recipe(~., data = accident_num_test) %>%
  step_impute_median(all_numeric(), -has_role("id")) %>%
  step_center(all_numeric(), -has_role("id")) %>%
  step_scale(all_numeric(), -has_role("id"))

cat_pipeline <- recipe(~., data = accident_cat_test) %>%
  step_dummy(all_nominal())

# Merge the preprocessed numerical and categorical features into a single dataset

copied_testdata <- copied_testdata %>% rename(Index = id) 

df1 <- mutate(num_pipeline %>% prep() %>% bake(new_data = NULL), join_key = "Index")
df2 <- mutate(cat_pipeline %>% prep() %>% bake(new_data = NULL), join_key = "Index")

accident_prepared_test <- accident_test %>% 
  select(-one_of(c(cat_attribs, num_attribs)))

accident_prepared_test <- cbind(accident_prepared_test, df1,df2)



accident_prepared_test <- accident_prepared_test %>% 
  distinct()

accident_prepared_test <- select(accident_prepared_test, -c("id", "join_key", "join_key.1"))

4 Models

4.1 Linear Regression

# Fit the linear regression model
lin_reg <- lm(label ~ ., data = accident_prepared)

# Use the model to predict the response variable using the test data
y_pred <- predict(lin_reg, newdata = accident_prepared_test)

# Calculate the residuals
residuals <- y_pred - label_test

# Calculate the squared errors
squared_errors <- residuals^2

# Calculate the mean squared error
mse <- mean(squared_errors)

# Print the MSE
cat("MSE:", mse)
## MSE: 0.1324914

4.2 Ridge Regression

#ridge regression

library(glmnet)

# Separate the predictor variables from the response variable
y <- label
X <- as.matrix(select(accident_prepared, -label))

# Define the lambda sequence for ridge regression
lambda_seq <- 10^seq(10, -2, length = 100)

# Perform cross-validated ridge regression
ridge_fit <- cv.glmnet(X, y, alpha = 0, lambda = lambda_seq)

# Plot the cross-validation results
plot(ridge_fit)


ridge_coef <- coef(ridge_fit)[-1]

y_pred <- predict(ridge_fit, newx = X)

mse <- mean((y - y_pred)^2)


# Print the MSE
cat("MSE:", mse)
## MSE: 0.1341102

4.3 Lasso Regression

x <- model.matrix(~ ., data = accident_prepared) 
y <- label

# Fit a Lasso regression with cross-validation
lasso_model <- cv.glmnet(x, y, alpha = 1) 

extra_columns <- setdiff(colnames(accident_prepared_test), colnames(accident_prepared))

accident_prepared_test <- accident_prepared_test %>%
                          select(-one_of(extra_columns))


# Predict the response variable using the test data
x_test <- model.matrix(~ ., data = accident_prepared_test) 
y_pred <- predict(lasso_model, newx = x_test)

# Calculate the MSE
mse <- mean((y_pred - label_test)^2)

# Print the MSE
cat("MSE:", mse)
## MSE: 0.134835


plot(lasso_model)

4.4 Random Forest

library(randomForest)

# Train the model
accident_forest <- randomForest(label ~ ., data = accident_prepared, ntree = 10, mtry = 5)

# Make predictions on the test set
pred <- predict(accident_forest, newdata = accident_prepared_test)

# Generate confusion matrix
confusion_matrix <- table(pred, label_test)

# Calculate MSE
mse <- mean((label_test - pred)^2)


print(paste("Mean Squared Error:", mse))
[1] "Mean Squared Error: 0.132744376926526"

5 Results

5.1 Mean Squared Error of all Models

5.2 Linear Regression Results

## 
## Linear Regression Results
## ==========================================================================
##                                                 Severity of Car Accident  
##                                                ---------------------------
##                                                           label           
## --------------------------------------------------------------------------
## Temperature.F.                                    0.002 (-0.001, 0.005)   
## Humidity...                                      0.007*** (0.004, 0.010)  
## Visibility.mi.                                   -0.002 (-0.005, 0.002)   
## Wind_Speed.mph.                                  -0.001 (-0.004, 0.002)   
## Precipitation.in.                                -0.0002 (-0.003, 0.002)  
## Crossing                                       -0.036*** (-0.043, -0.029) 
## Junction                                          0.019 (-0.014, 0.052)   
## Traffic_Signal                                 -0.014*** (-0.021, -0.007) 
## State_AR                                         0.150*** (0.101, 0.199)  
## State_AZ                                       -0.199*** (-0.233, -0.165) 
## State_CA                                       -0.104*** (-0.135, -0.074) 
## State_CO                                         0.554*** (0.506, 0.602)  
## State_CT                                         0.503*** (0.451, 0.555)  
## State_DC                                         0.149*** (0.103, 0.196)  
## State_DE                                         0.375*** (0.308, 0.441)  
## State_FL                                       -0.097*** (-0.127, -0.066) 
## State_GA                                         0.601*** (0.552, 0.650)  
## State_IA                                         0.400*** (0.336, 0.463)  
## State_ID                                          0.006 (-0.042, 0.053)   
## State_IL                                         0.290*** (0.254, 0.327)  
## State_IN                                         0.594*** (0.540, 0.648)  
## State_KS                                          0.112 (-0.007, 0.231)   
## State_KY                                          0.024 (-0.044, 0.092)   
## State_LA                                       -0.119*** (-0.152, -0.086) 
## State_MA                                         0.538*** (0.447, 0.630)  
## State_MD                                         0.193*** (0.157, 0.229)  
## State_ME                                        -0.162* (-0.316, -0.009)  
## State_MI                                         0.084*** (0.048, 0.120)  
## State_MN                                       -0.105*** (-0.138, -0.072) 
## State_MO                                          0.006 (-0.039, 0.050)   
## State_MS                                         0.135*** (0.057, 0.214)  
## State_MT                                       -0.094*** (-0.133, -0.056) 
## State_NC                                         -0.018 (-0.049, 0.014)   
## State_ND                                        -0.119* (-0.220, -0.019)  
## State_NE                                         0.311*** (0.215, 0.407)  
## State_NH                                         0.617*** (0.477, 0.758)  
## State_NJ                                         0.271*** (0.233, 0.310)  
## State_NM                                         -0.101 (-0.245, 0.044)   
## State_NV                                         0.196*** (0.112, 0.280)  
## State_NY                                         0.157*** (0.124, 0.190)  
## State_OH                                          0.030 (-0.010, 0.069)   
## State_OK                                         -0.008 (-0.050, 0.035)   
## State_OR                                       -0.053*** (-0.084, -0.021) 
## State_PA                                          0.012 (-0.020, 0.043)   
## State_RI                                          0.109 (-0.092, 0.311)   
## State_SC                                       -0.098*** (-0.130, -0.067) 
## State_SD                                          0.078 (-0.105, 0.261)   
## State_TN                                       -0.087*** (-0.120, -0.053) 
## State_TX                                       -0.093*** (-0.125, -0.062) 
## State_UT                                       -0.057*** (-0.094, -0.021) 
## State_VA                                         0.071*** (0.038, 0.103)  
## State_VT                                        -0.236** (-0.428, -0.043) 
## State_WA                                         0.242*** (0.202, 0.281)  
## State_WI                                         1.430*** (1.339, 1.520)  
## State_WV                                          0.066 (-0.005, 0.136)   
## Weather_Condition_Blowing.Dust...Windy            0.023 (-0.457, 0.503)   
## Weather_Condition_Blowing.Snow...Windy           -0.175 (-0.815, 0.466)   
## Weather_Condition_Clear                           0.273 (-0.062, 0.607)   
## Weather_Condition_Cloudy                          0.015 (-0.212, 0.241)   
## Weather_Condition_Cloudy...Windy                  0.028 (-0.201, 0.258)   
## Weather_Condition_Drizzle                         0.136 (-0.110, 0.381)   
## Weather_Condition_Drizzle.and.Fog                -0.068 (-0.444, 0.307)   
## Weather_Condition_Fair                            0.020 (-0.207, 0.246)   
## Weather_Condition_Fair...Windy                    0.052 (-0.176, 0.280)   
## Weather_Condition_Fog                             0.007 (-0.220, 0.234)   
## Weather_Condition_Fog...Windy                    -0.007 (-0.340, 0.327)   
## Weather_Condition_Freezing.Drizzle               -0.183 (-0.823, 0.457)   
## Weather_Condition_Haze                            0.010 (-0.217, 0.238)   
## Weather_Condition_Haze...Windy                    0.057 (-0.202, 0.315)   
## Weather_Condition_Heavy.Drizzle                  -0.057 (-0.470, 0.356)   
## Weather_Condition_Heavy.Rain                      0.008 (-0.223, 0.238)   
## Weather_Condition_Heavy.Rain...Windy              0.196 (-0.099, 0.492)   
## Weather_Condition_Heavy.Sleet                     0.030 (-0.610, 0.670)   
## Weather_Condition_Heavy.Snow                      0.038 (-0.213, 0.290)   
## Weather_Condition_Heavy.Snow...Windy              0.017 (-0.463, 0.498)   
## Weather_Condition_Heavy.T.Storm                  -0.008 (-0.241, 0.226)   
## Weather_Condition_Heavy.T.Storm...Windy          -0.165 (-0.467, 0.138)   
## Weather_Condition_Light.Drizzle                   0.040 (-0.192, 0.271)   
## Weather_Condition_Light.Drizzle...Windy          0.659*** (0.308, 1.010)  
## Weather_Condition_Light.Freezing.Drizzle         -0.054 (-0.404, 0.297)   
## Weather_Condition_Light.Freezing.Rain             0.252 (-0.050, 0.554)   
## Weather_Condition_Light.Freezing.Rain...Windy    0.871*** (0.496, 1.247)  
## Weather_Condition_Light.Ice.Pellets              0.814*** (0.334, 1.294)  
## Weather_Condition_Light.Rain                      0.029 (-0.198, 0.255)   
## Weather_Condition_Light.Rain...Windy              0.057 (-0.177, 0.292)   
## Weather_Condition_Light.Rain.Shower               0.143 (-0.233, 0.519)   
## Weather_Condition_Light.Rain.with.Thunder        -0.007 (-0.238, 0.223)   
## Weather_Condition_Light.Snow                      0.067 (-0.161, 0.294)   
## Weather_Condition_Light.Snow...Windy              0.048 (-0.194, 0.291)   
## Weather_Condition_Light.Snow.and.Sleet           -0.011 (-0.651, 0.629)   
## Weather_Condition_Light.Snow.Shower              -0.023 (-0.663, 0.618)   
## Weather_Condition_Light.Thunderstorms.and.Rain    0.613 (-0.028, 1.253)   
## Weather_Condition_Mist                            0.123 (-0.162, 0.408)   
## Weather_Condition_Mostly.Cloudy                   0.006 (-0.221, 0.232)   
## Weather_Condition_Mostly.Cloudy...Windy           0.022 (-0.208, 0.252)   
## Weather_Condition_N.A.Precipitation              -0.091 (-0.345, 0.163)   
## Weather_Condition_Overcast                       0.498*** (0.263, 0.734)  
## Weather_Condition_Partly.Cloudy                   0.016 (-0.211, 0.242)   
## Weather_Condition_Partly.Cloudy...Windy          -0.004 (-0.238, 0.230)   
## Weather_Condition_Patches.of.Fog                 -0.003 (-0.255, 0.250)   
## Weather_Condition_Rain                            0.014 (-0.214, 0.242)   
## Weather_Condition_Rain...Windy                    0.036 (-0.231, 0.303)   
## Weather_Condition_Scattered.Clouds               -0.157 (-0.570, 0.256)   
## Weather_Condition_Shallow.Fog                    -0.055 (-0.304, 0.194)   
## Weather_Condition_Showers.in.the.Vicinity        -0.080 (-0.493, 0.334)   
## Weather_Condition_Sleet                          -0.215 (-0.695, 0.265)   
## Weather_Condition_Smoke                           0.131 (-0.099, 0.360)   
## Weather_Condition_Smoke...Windy                  1.000*** (0.520, 1.480)  
## Weather_Condition_Snow                            0.081 (-0.154, 0.316)   
## Weather_Condition_Snow...Windy                   -0.002 (-0.322, 0.318)   
## Weather_Condition_Snow.and.Sleet                 -0.119 (-0.759, 0.521)   
## Weather_Condition_T.Storm                         0.025 (-0.205, 0.255)   
## Weather_Condition_T.Storm...Windy                -0.016 (-0.429, 0.397)   
## Weather_Condition_Thunder                         0.011 (-0.219, 0.241)   
## Weather_Condition_Thunder...Windy                -0.025 (-0.315, 0.264)   
## Weather_Condition_Thunder...Wintry.Mix           -0.253 (-0.894, 0.388)   
## Weather_Condition_Thunder.and.Hail                0.004 (-0.636, 0.643)   
## Weather_Condition_Thunder.in.the.Vicinity         0.033 (-0.195, 0.262)   
## Weather_Condition_Widespread.Dust                 0.005 (-0.635, 0.644)   
## Weather_Condition_Widespread.Dust...Windy         0.021 (-0.619, 0.661)   
## Weather_Condition_Wintry.Mix                      0.040 (-0.194, 0.274)   
## Weather_Condition_Wintry.Mix...Windy             -0.008 (-0.648, 0.631)   
## Sunrise_Sunset_Night                             0.012*** (0.006, 0.017)  
## Constant                                         2.096*** (1.867, 2.324)  
## ==========================================================================
## ==========================================================================
## Note:                                          *p<0.1; **p<0.05; ***p<0.01
library(coefplot)
library(broom)


# Extract coefficients and standard errors
coef_df <- tidy(lin_reg, conf.int = TRUE)

# Filter out intercept
coef_df <- coef_df[-1,]

num_coef_df <- coef_df[coef_df$term %in% num_attribs,]
cat_coef_df <- coef_df[grep(".*\\_.*", coef_df$term), ]

# Create plots
plot_num <- ggplot(num_coef_df, aes(x = estimate, y = reorder(term, estimate))) +
  geom_point(size = 2) +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  labs(x = "Coefficient Estimate", y = "Variable") +
  ggtitle("Linear Regression Results for Numeric Variables") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))+
  geom_vline(xintercept = 0, linetype = "dashed", color = "red")

plot_num



cat_coef_df1 <- cat_coef_df[1:25,]
cat_coef_df2 <- cat_coef_df[25:50,]
cat_coef_df3 <- cat_coef_df[50:75,]
cat_coef_df4 <- cat_coef_df[75:100,]
cat_coef_df5 <- cat_coef_df[100:125,]



# Create  separate plots
plot_cat1 <- ggplot(cat_coef_df1, aes(x = estimate, y = reorder(term, estimate))) +
  geom_point(size = 2) +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  labs(x = "Coefficient Estimate", y = "Variable") +
  ggtitle("Linear Regression Results for Categorical Variables") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))+
  geom_vline(xintercept = 0, linetype = "dashed", color = "red")

plot_cat1


plot_cat2 <- ggplot(cat_coef_df2, aes(x = estimate, y = reorder(term, estimate))) +
  geom_point(size = 2) +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  labs(x = "Coefficient Estimate", y = "Variable") +
  ggtitle("Linear Regression Results for Categorical Variables") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))+
  geom_vline(xintercept = 0, linetype = "dashed", color = "red")

plot_cat2


plot_cat3 <- ggplot(cat_coef_df3, aes(x = estimate, y = reorder(term, estimate))) +
  geom_point(size = 2) +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  labs(x = "Coefficient Estimate", y = "Variable") +
  ggtitle("Linear Regression Results for Categorical Variables") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))+
  geom_vline(xintercept = 0, linetype = "dashed", color = "red")
plot_cat3


plot_cat4 <- ggplot(cat_coef_df4, aes(x = estimate, y = reorder(term, estimate))) +
  geom_point(size = 2) +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  labs(x = "Coefficient Estimate", y = "Variable") +
  ggtitle("Linear Regression Results for Categorical Variables") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))+
  geom_vline(xintercept = 0, linetype = "dashed", color = "red")

plot_cat4


plot_cat5 <- ggplot(cat_coef_df5, aes(x = estimate, y = reorder(term, estimate))) +
  geom_point(size = 2) +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  labs(x = "Coefficient Estimate", y = "Variable") +
  ggtitle("Linear Regression Results for Categorical Variables") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))+
  geom_vline(xintercept = 0, linetype = "dashed", color = "red")

plot_cat5

5.3 Decision Tree Results

varImpPlot(accident_forest, main = "Variable Importance Plot")

6 Discussion

7 Conclusion